{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# MCT-LTDiag Dataset: Google Drive Download Example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 37143,
     "status": "ok",
     "timestamp": 1757143929975,
     "user": {
      "displayName": "Haoyang Su",
      "userId": "12713581229047809553"
     },
     "user_tz": -480
    },
    "id": "yjbl-i4z4Bzz",
    "outputId": "345e8b0d-5910-4f02-e25f-473edc844d7b"
   },
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "from google.colab import drive\n",
    "\n",
    "# Mount Google Drive at /content/drive.\n",
    "drive.mount('/content/drive')\n",
    "\n",
    "# Target folder on the mounted Drive; created here if it does not exist yet.\n",
    "output = \"/content/drive/MyDrive/MCT_LTDiag/v_download\"\n",
    "os.makedirs(name=output, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 1000
    },
    "executionInfo": {
     "elapsed": 156389,
     "status": "error",
     "timestamp": 1757144667362,
     "user": {
      "displayName": "Haoyang Su",
      "userId": "12713581229047809553"
     },
     "user_tz": -480
    },
    "id": "317qXjHy-QIz",
    "outputId": "849d5d50-8ff3-466a-989a-60fd5576a75d"
   },
   "outputs": [],
   "source": [
    "import requests\n",
    "import os\n",
    "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
    "\n",
    "API_TOKEN = \"\" # fill in your api token obtained through Harvard Dataverse\n",
    "PERSISTENT_ID = \"doi:10.7910/DVN/S3RW15\"\n",
    "SAVE_DIR = \"/content/drive/MyDrive/MCT_LTDiag/v_download\" # your own data storage dir\n",
    "os.makedirs(SAVE_DIR, exist_ok=True)\n",
    "\n",
    "# Send the API-key header only when a token was actually filled in, so the\n",
    "# empty placeholder above is never presented to the server as a credential.\n",
    "headers = {\"X-Dataverse-key\": API_TOKEN} if API_TOKEN else {}\n",
    "\n",
    "# Obtain the MCT_LTDiag dataset JSON, which lists the files of the latest version.\n",
    "# requests applies no timeout unless one is given, so set one explicitly to\n",
    "# keep an unresponsive server from hanging this cell forever.\n",
    "dataset_url = f\"https://dataverse.harvard.edu/api/datasets/:persistentId?persistentId={PERSISTENT_ID}\"\n",
    "resp = requests.get(dataset_url, headers=headers, timeout=60)\n",
    "resp.raise_for_status()\n",
    "dataset_json = resp.json()\n",
    "\n",
    "files = dataset_json[\"data\"][\"latestVersion\"][\"files\"]\n",
    "\n",
    "def download_file(f, timeout=(30, 300)):\n",
    "    \"\"\"Download one dataset file into SAVE_DIR unless it is already there.\n",
    "\n",
    "    Args:\n",
    "        f: One entry of the dataset's file list; f['dataFile'] must provide\n",
    "            'filename' and 'id'.\n",
    "        timeout: (connect, read) timeout in seconds handed to requests.\n",
    "\n",
    "    Returns:\n",
    "        A status string: 'File <name> already exists, skipped' or 'Saved <name>'.\n",
    "\n",
    "    Raises:\n",
    "        requests.HTTPError: if the server answers with an error status.\n",
    "        requests.RequestException: on connection failures or timeouts.\n",
    "        OSError: if the file cannot be written or renamed.\n",
    "    \"\"\"\n",
    "    file_name = f[\"dataFile\"][\"filename\"]\n",
    "    file_id = f[\"dataFile\"][\"id\"]\n",
    "    download_url = f\"https://dataverse.harvard.edu/api/access/datafile/{file_id}\"\n",
    "    save_path = os.path.join(SAVE_DIR, file_name)\n",
    "\n",
    "    if os.path.exists(save_path):\n",
    "        return f\"File {file_name} already exists, skipped\"\n",
    "\n",
    "    print(f\"Downloading {file_name} ...\")\n",
    "    # Stream into a temporary '.part' file and rename it only after the whole\n",
    "    # body has been written. An interrupted or failed run therefore never\n",
    "    # leaves a truncated file at save_path that the check above would skip.\n",
    "    part_path = save_path + \".part\"\n",
    "    try:\n",
    "        with requests.get(download_url, headers=headers, stream=True, timeout=timeout) as r:\n",
    "            # Without this, an error response body would be saved as the data file.\n",
    "            r.raise_for_status()\n",
    "            with open(part_path, \"wb\") as fd:\n",
    "                for chunk in r.iter_content(chunk_size=1024*1024):\n",
    "                    if chunk:\n",
    "                        fd.write(chunk)\n",
    "        os.replace(part_path, save_path)\n",
    "    finally:\n",
    "        # The partial file still exists only when something above failed.\n",
    "        if os.path.exists(part_path):\n",
    "            os.remove(part_path)\n",
    "    return f\"Saved {file_name}\"\n",
    "\n",
    "max_workers = 5\n",
    "failed = []  # file names whose download raised\n",
    "with ThreadPoolExecutor(max_workers=max_workers) as executor:\n",
    "    # Map each future to its file name so a failure can be attributed to a file.\n",
    "    futures = {executor.submit(download_file, f): f[\"dataFile\"][\"filename\"] for f in files}\n",
    "    for future in as_completed(futures):\n",
    "        try:\n",
    "            print(future.result())\n",
    "        except Exception as exc:\n",
    "            # Keep reporting: one bad file must not hide the outcome of the rest.\n",
    "            failed.append(futures[future])\n",
    "            print(f\"Failed {futures[future]}: {exc}\")\n",
    "\n",
    "# Still fail the cell loudly when anything went wrong, naming the files.\n",
    "if failed:\n",
    "    raise RuntimeError(f\"{len(failed)} file(s) failed to download: {failed}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import tarfile\n",
    "from concurrent.futures import ProcessPoolExecutor, as_completed\n",
    "\n",
    "SAVE_DIR = \"/content/drive/MyDrive/MCT_LTDiag/v_download\" # your own data storage dir\n",
    "\n",
    "def extract_tar(file_path):\n",
    "    \"\"\"Extract a .tar archive into SAVE_DIR/<archive name> and delete the archive.\n",
    "\n",
    "    Args:\n",
    "        file_path: Path of the candidate archive.\n",
    "\n",
    "    Returns:\n",
    "        'Processed <path>' after a successful extraction, or 'Skipped <path>'\n",
    "        when the path does not end in '.tar'.\n",
    "\n",
    "    Raises:\n",
    "        tarfile.TarError, OSError: if the archive cannot be read or extracted;\n",
    "            the archive is left in place in that case.\n",
    "    \"\"\"\n",
    "    suffix = \".tar\"\n",
    "    if not file_path.endswith(suffix):\n",
    "        return f\"Skipped {file_path}\"\n",
    "\n",
    "    # Drop only the trailing '.tar'. str.replace would also remove a '.tar'\n",
    "    # occurring earlier in the name (e.g. 'x.targets.tar' -> 'xgets').\n",
    "    stem = os.path.basename(file_path)[:-len(suffix)]\n",
    "    target_dir = f\"{SAVE_DIR}/{stem}\"\n",
    "    # The archives come from the network: where this Python offers extraction\n",
    "    # filters, use 'data' so members cannot be written outside target_dir.\n",
    "    extract_kwargs = {\"filter\": \"data\"} if hasattr(tarfile, \"data_filter\") else {}\n",
    "\n",
    "    print(f\"Extracting {file_path} ...\")\n",
    "    with tarfile.open(file_path, \"r\") as tar:\n",
    "        tar.extractall(path=target_dir, **extract_kwargs)\n",
    "    # Reached only when extraction did not raise.\n",
    "    os.remove(file_path)\n",
    "    return f\"Processed {file_path}\"\n",
    "\n",
    "# Gather every .tar archive located directly inside SAVE_DIR.\n",
    "tar_files = []\n",
    "for entry in os.listdir(SAVE_DIR):\n",
    "    if entry.endswith(\".tar\"):\n",
    "        tar_files.append(os.path.join(SAVE_DIR, entry))\n",
    "\n",
    "# One worker process per CPU reported by the OS.\n",
    "max_workers = os.cpu_count()\n",
    "with ProcessPoolExecutor(max_workers=max_workers) as executor:\n",
    "    futures = []\n",
    "    for archive in tar_files:\n",
    "        futures.append(executor.submit(extract_tar, archive))\n",
    "    # Print each status message as soon as its extraction finishes.\n",
    "    for future in as_completed(futures):\n",
    "        status = future.result()\n",
    "        print(status)\n",
    "\n",
    "print(\"All .tar files have been extracted.\")"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "authorship_tag": "ABX9TyO6nQNxa1U9aU7T76yDIKOE",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
